**if time
## [1] "Last knit to html at: 2015-10-06 18:14:21"
Exercises(0):
ggplot(flr, aes(x = shakeNumber, y = slope)) +
geom_point() +
theme_bw()
# 0.1
setwd("/Users/callinswitzer/Dropbox/Harvard/RByCall/RClassMaterials/Session4")
# 0.2
library(ggplot2)
library(grid) # we need this for the "unit" function
# 0.3
flr <- read.csv("Datasets/CallinPollen.csv")
# 0.4
p1 <- ggplot(flr, aes(x = shakeNumber, y = slope)) +
geom_point() +
theme_bw()
p1
# read in data
crs <- read.csv("Datasets/mtcars.txt", sep = " ")
# plot it
ggplot(crs, aes(x = wt, y = mpg)) +
geom_point()
# color points, based on the number of cylinders
ggplot(crs, aes(x = wt, y = mpg)) +
geom_point(aes(color = cyl))
# same thing
ggplot(crs, aes(x = wt, y = mpg, color = cyl)) +
geom_point()
# we don't want a spectrum of colors, so we'll make crs$cyl into a factor
crs$cyl <- as.factor(crs$cyl)
ggplot(crs, aes(x = wt, y = mpg)) +
geom_point(aes(color = cyl))
# change color scheme
ggplot(crs, aes(x = wt, y = mpg)) +
geom_point(aes(fill= cyl), color = "black", shape = 21, size = 4) +
theme_classic() +
scale_fill_manual(values = c("black", "grey50", "white"))
# color points, based the cyl being greater than 4
Cyl.Gr.4 <- as.numeric(as.character(crs$cyl)) > 4
ggplot(crs, aes(x = wt, y = mpg)) +
geom_point(aes(fill= Cyl.Gr.4), color = "black", shape = 21, size = 4) +
theme_classic() +
scale_fill_manual(values = c("black", "white"))
# size points, based on a factor
ggplot(crs, aes(x = wt, y = mpg)) +
geom_point(aes(size = Cyl.Gr.4), color = "black", fill = "grey", shape = 21) +
theme_classic() +
scale_size_manual(values = c(1, 10))
# you can do the same thing with shape
ggplot(crs, aes(x = wt, y = mpg)) +
geom_point(aes(shape = Cyl.Gr.4), color = "black", fill = "grey", size = 4) +
theme_classic() +
scale_shape_manual(values = c(22, 21), name = "Num. Cyl. \n > 4")
Exercises(1):
ggplot(crs, aes(x = disp, y = mpg)) +
geom_point(aes(color = as.factor(am)), size = 4) +
theme_classic() +
scale_color_brewer(palette = "Paired", name = "Automatic Tranny")
################################ EXAMPLE 1 ################################
# making a boxplot
################################ EXAMPLE 1 ################################
# here's a plot that looks weird as points
ggplot(flr, aes(x = shakeNumber, y = slope)) +
geom_point()
# say we think that shake number should be represented with a boxplot, rather than with dots
p2 <- ggplot(flr, aes(x = shakeNumber, y = slope)) +
geom_boxplot()
p2 # not quite right
p3 <- ggplot(flr, aes(x = as.factor(shakeNumber), y = slope)) +
geom_boxplot()
p3 # much better
# log transform
p3 + scale_y_log10()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
################################ EXAMPLE 2 ################################
# changing the order of the boxes in a plot
################################ EXAMPLE 2 ################################
# a basic boxplot
p4 <- ggplot(flr, aes(x = trt, y = int)) +
geom_boxplot()
p4
# changing order of the boxes -- change levels of a factor
str(flr$trt) # factor with levels alphabetized
## Factor w/ 9 levels "AC","AC and dehumidifier",..: 4 4 4 4 4 4 4 4 4 4 ...
# we can change the order of the factor levels to change the plot
levels(flr$trt)
## [1] "AC" "AC and dehumidifier"
## [3] "AC set to 62 overnight" "Ambient"
## [5] "dehumidifier" "heat and humidifier"
## [7] "heat and humidifier in alcove" "heat set to 80F overnight"
## [9] "Humidifier"
newlevels <- c('heat and humidifier in alcove', 'heat and humidifier', 'AC', 'AC and dehumidifier', 'AC set to 62 overnight', 'heat set to 80F overnight', 'Ambient', 'dehumidifier', 'Humidifier')
flr$trt <- factor(flr$trt, levels = newlevels)
p5 <- ggplot(flr, aes(x = trt, y = int)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), # make text 45 degrees
plot.margin = unit(c(2,1,1,1), "cm")) # move the left margin over
p5
# we can order the factor levels based on their means
(trtMeans <- tapply(X = flr$int, FUN = mean, INDEX = flr$trt))
## heat and humidifier in alcove heat and humidifier
## 0.05134368 0.06338917
## AC AC and dehumidifier
## 0.14195595 0.09301399
## AC set to 62 overnight heat set to 80F overnight
## 0.12963241 0.05722544
## Ambient dehumidifier
## 0.09773810 0.08195999
## Humidifier
## 0.14091004
# sort the treatments
(trtMeans.sort <- trtMeans[order(trtMeans, decreasing = TRUE)])
## AC Humidifier
## 0.14195595 0.14091004
## AC set to 62 overnight Ambient
## 0.12963241 0.09773810
## AC and dehumidifier dehumidifier
## 0.09301399 0.08195999
## heat and humidifier heat set to 80F overnight
## 0.06338917 0.05722544
## heat and humidifier in alcove
## 0.05134368
names(trtMeans.sort) # sorted vector, based on mean
## [1] "AC" "Humidifier"
## [3] "AC set to 62 overnight" "Ambient"
## [5] "AC and dehumidifier" "dehumidifier"
## [7] "heat and humidifier" "heat set to 80F overnight"
## [9] "heat and humidifier in alcove"
# change the factor levels
flr$trt <- factor(flr$trt, levels = names(trtMeans.sort))
p6.ordered <- ggplot(flr, aes(x = trt, y = int)) +
geom_boxplot(fill = 'black', alpha = 0.5, outlier.colour = NA) + # don't show outliers
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = unit(c(2,1,1,1), "cm")) +
labs(x = "Treatment", y = "Pollen release (integration method)")
p6.ordered
p6.ordered + geom_point() # add points to our plot
p7 <- p6.ordered + geom_point(position = position_jitter(width = 0.1, height = 0), alpha = 0.5)
p7
# save the plot
ggsave("boxpltTrt.pdf", plot = p7, device = cairo_pdf, width = 89, height = 60, unit = "mm")
file.remove("boxpltTrt.pdf") # remove plot
## [1] TRUE
################################ EXAMPLE 3 ################################
# changing the order of the bars in a plot
################################ EXAMPLE 3 ################################
surv <- read.csv("Datasets/survey.csv")
# map values to shorter words
library(plyr)
surv$Rexp <- mapvalues(surv$Rexp, from = as.character(levels(surv$Rexp)), to = c("Instar", "Adult", "Egg", "Pupa"))
ggplot(surv, aes(x = Rexp)) +
geom_bar() +
theme_minimal()
# set new levels
surv$Rexp <- factor(surv$Rexp, levels = c("Egg", "Instar", "Pupa", "Adult"))
# replot
ggplot(surv, aes(x = Rexp)) +
geom_bar() +
theme_minimal()
Exercises(1, cont’):
# makes a new, small data frame that contains only the same levels as the Rexp column
# drops factor levels with 0 items
s.sm <- droplevels(surv[surv$BestExp %in% surv$Rexp, ])
# maps values to one-word, instead of long strings
library(plyr)
s.sm$BestExp <- mapvalues(s.sm$BestExp, from = as.character(levels(s.sm$BestExp)), to = c("Instar", "Adult", "Egg", "Pupa"))
surv <- read.csv("Datasets/survey.csv")
s.sm <- droplevels(surv[surv$BestExp %in% surv$Rexp, ])
library(plyr)
s.sm$BestExp <- mapvalues(s.sm$BestExp, from = as.character(levels(s.sm$BestExp)), to = c("Instar", "Adult", "Egg", "Pupa"))
ggplot(s.sm, aes(x = BestExp)) +
geom_bar() +
theme_minimal()
# change order of bars
s.sm$BestExp <- factor(s.sm$BestExp, levels = c("Egg", "Instar", "Pupa", "Adult"))
# replot
ggplot(s.sm, aes(x = BestExp)) +
geom_bar() +
theme_minimal()
# plot horsepower vs quarter second time
ggplot(crs, aes(x = hp, y = qsec)) +
geom_point()
# add a loess (local regression) line
ggplot(crs, aes(x = hp, y = qsec)) +
geom_point() +
geom_smooth(method = "loess")
# add least squares line, without standard errors
ggplot(crs, aes(x = hp, y = qsec)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# same thing
ggplot(crs, aes(x = hp, y = qsec)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
# same thing, specifying formula
ggplot(crs, aes(x = hp, y = qsec)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE, formula = y~log(x) )
Exercises(1, cont’):
ggplot(crs, aes(x = drat, y = hp)) +
geom_point(aes(color = cyl), size = 4) +
geom_smooth(method = "lm", se = FALSE, color = "seashell3", size = 3) +
theme_classic()
################################ EXAMPLE 1 ################################
# for and while loops
################################ EXAMPLE 1 ################################
# for loops repeat a specific number of times
for(i in 1:10){
print(i + 100)
}
## [1] 101
## [1] 102
## [1] 103
## [1] 104
## [1] 105
## [1] 106
## [1] 107
## [1] 108
## [1] 109
## [1] 110
# make fake data
df1 <- data.frame(person = c("bob", "sue", "jack", "ann"), number1 = c(4,5,6,1), number2 = c(2,4,6,6))
df1
## person number1 number2
## 1 bob 4 2
## 2 sue 5 4
## 3 jack 6 6
## 4 ann 1 6
# another for loop
for(ii in df1$person){
print(rep(ii, times = df1[df1$person == ii, "number1"]))
}
## [1] "bob" "bob" "bob" "bob"
## [1] "sue" "sue" "sue" "sue" "sue"
## [1] "jack" "jack" "jack" "jack" "jack" "jack"
## [1] "ann"
# another for loop to make lots of plots
for(foo in c("wt", "hp", "mpg", "qsec", "disp")){
hist(crs[ , foo], main = foo)
rug(crs[ ,foo])
}
## while loops repeat until something happens
x <- 1
while(x < 10){
print(x + 100)
x <- x + 1
}
## [1] 101
## [1] 102
## [1] 103
## [1] 104
## [1] 105
## [1] 106
## [1] 107
## [1] 108
## [1] 109
Exercises(2):
for(i in 1:100){
print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
## [1] 70
## [1] 71
## [1] 72
## [1] 73
## [1] 74
## [1] 75
## [1] 76
## [1] 77
## [1] 78
## [1] 79
## [1] 80
## [1] 81
## [1] 82
## [1] 83
## [1] 84
## [1] 85
## [1] 86
## [1] 87
## [1] 88
## [1] 89
## [1] 90
## [1] 91
## [1] 92
## [1] 93
## [1] 94
## [1] 95
## [1] 96
## [1] 97
## [1] 98
## [1] 99
## [1] 100
################################ EXAMPLE 2 ################################
# if/else
################################ EXAMPLE 2 ################################
# you can use "if" and "else" inside loops to do different things
for(i in 1:10){
if(i > 5) print(i + 100)
else print("hello")
}
## [1] "hello"
## [1] "hello"
## [1] "hello"
## [1] "hello"
## [1] "hello"
## [1] 106
## [1] 107
## [1] 108
## [1] 109
## [1] 110
# same
for(i in 1:10){
if(i > 5){
print(i + 100)
}
else{
print("hello")
}
}
## [1] "hello"
## [1] "hello"
## [1] "hello"
## [1] "hello"
## [1] "hello"
## [1] 106
## [1] 107
## [1] 108
## [1] 109
## [1] 110
# example with data frame
for(nums in 1:length(df1$number1)){
print(nums)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
for(nums in 1:length(df1$number1)){
print(as.character(df1$person[nums]))
}
## [1] "bob"
## [1] "sue"
## [1] "jack"
## [1] "ann"
for(nums in 1:length(df1$number1)){
if(as.character(df1$person[nums]) == "sue"){
print("sue was here")
}
else if(as.character(df1$person[nums]) == "bob"){
print("bob was here")
}
else print("ann or jack")
}
## [1] "bob was here"
## [1] "sue was here"
## [1] "ann or jack"
## [1] "ann or jack"
# example of making a new vector
vec1 <- rep(NA, nrow(df1))
for(nums in 1:length(df1$number1)){
if(as.character(df1$person[nums]) == "sue"){
vec1[nums] <- ("sue was here")
}
else if(as.character(df1$person[nums]) == "bob"){
vec1[nums] <- ("bob was here")
}
else vec1[nums] <- ("ann or jack")
}
vec1
## [1] "bob was here" "sue was here" "ann or jack" "ann or jack"
Exercises(2, cont’):
for(i in 1:100){
if (i < 25 ) print("low")
else if (i >= 25 & i <= 75) print("middle")
else print("high")
}
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "low"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "middle"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
## [1] "high"
for(cols in 1:length(colnames(crs))){
if(is.numeric(crs[, cols])) hist(crs[, cols], main = names(crs[cols]), col = "red")
}
################################ EXAMPLE 2 ################################
# apply functions
################################ EXAMPLE 2 ################################
# here's a simple function
x2 <- function(x) x^2
x2(5)
## [1] 25
# here is a function
trimMean <- function(vect) {
qs <- quantile(vect, probs = c(.10, .90))
mean(vect[vect > qs[1] & vect < qs[2]])
}
nums <- c(1:100, 1000)
nums2 <- c(50:100, 1500, 1)
nums3 <- c(1,3,5,6,7,7,5,4,34,.76,87,8,8,8,6,5,4.4,4,56,7,7,88,33)
# make a new list, from our numbers
listNums <- list(nums, nums2, nums3)
trimMean(nums)
## [1] 51
trimMean(nums2)
## [1] 75
trimMean(nums3)
## [1] 9.317647
# trimMean(listNums) doesn't work
# trimMean(nums, nums2) #doesn't work
trimMean(c(nums, nums2, nums3)) # doesn't give three different numbers
## [1] 54.32624
# we could write a for loop
listMNS <- numeric(length(listNums))
for(i in 1:length(listNums)){
listMNS[i] <- trimMean(listNums[[i]])
}
listMNS
## [1] 51.000000 75.000000 9.317647
# or we could use apply
sapply(listNums, FUN = trimMean)
## [1] 51.000000 75.000000 9.317647
## Apply functions are often faster than loops
# this for loop is slow, because we grow the newVec variable on every iteration of the loop
# note: loop isn't needed here, you could just use 1:30000 + 100 to get a new vector.
system.time({
newVec <- numeric()
for(ii in 1:30000){
newVec[ii] <- ii + 100
}
})
## user system elapsed
## 1.099 0.341 1.442
head(newVec)
## [1] 101 102 103 104 105 106
## sapply function (for illustration only....sapply is not necessary here)
# first, we write a function for what we want to happen
x100 <- function(x) x + 100
# apply that function to a vector
system.time({
newVec.sa <- sapply(X = 1:100000, FUN = x100)
})
## user system elapsed
## 0.063 0.001 0.064
head(newVec.sa)
## [1] 101 102 103 104 105 106
## you can speed up a for-loop by pre-allocating space
system.time({
newVec <- numeric(100000)
for(ii in 1:100000){
newVec[ii] <- ii + 100
}
})
## user system elapsed
## 0.092 0.001 0.093
## you can use apply functions on a data frame
# say you want to calculate the trimmed mean (removing the max and min) for several columns in your data frame
tMean <- function(x){
x <- x[x != min(x, na.rm = TRUE) & x != max(x, na.rm = TRUE)]
return(mean(x, na.rm=TRUE))
}
tMean(c(1:100, 10000, NA, NA))
## [1] 51
mean(c(1:100, 10000, NA), na.rm =TRUE)
## [1] 149.0099
# you could use a for loop
tms <- numeric()
for(ii in c("temp", "humidity", "int", "slope")){
tms[ii] <- tMean(flr[, ii])
}
tms
## temp humidity int slope
## 25.302283105 55.290476190 0.094796074 0.008759418
# or you could use apply
# apply allows you to choose the margin -- I think it converts to a matrix first
apply(X = flr[, c("temp", "humidity", "int", "slope")], MARGIN = 2, FUN = tMean)
## temp humidity int slope
## 25.302283105 55.290476190 0.094796074 0.008759418
# same thing
sapply(X = flr[, c("temp", "humidity", "int", "slope")], FUN = tMean)
## temp humidity int slope
## 25.302283105 55.290476190 0.094796074 0.008759418
Exercises(2, cont’):
maxHighF <- function(thing) {
if(is.numeric(thing)) return(max(thing, na.rm = TRUE))
else if(is.factor(thing)) return(levels(thing)[length(levels(thing))])
else return(NA)
}
maxHighF(crs$hp) # works
## [1] 335
maxHighF(flr$trt) # works
## [1] "heat and humidifier in alcove"
maxHighF(flr[, c("trt", "humidity")]) # doesn't work
## [1] NA
sapply(flr, FUN = maxHighF)
## flowerNum trt
## "332" "heat and humidifier in alcove"
## daysOpen distance
## "4" "12"
## shakeNumber temp
## "8" "33.8"
## humidity notes
## "83" "weird petals"
## dropFromAnalysis file
## "y" "332_30.8_45_7 08062014"
## pxToMmConversion firstPollen
## "36.18223984" "704"
## firstPollenOutOfScreen blankFrames
## "908" "1211"
## slope int
## "0.132515913" "0.385708641"
## aperture focus
## "16" "1:15"
## frameRate shutter
## "1000" "\"1/2000\""
## amplitude freq
## "0.38" "279"
## sec plant
## "0.1792115" "sml001"
## openTime DateOfExp
## "overnight" "8/6/14"
# functions take in something, do something, then output something
# example 1
printNum.plus5 <- function(num){
print(num + 5)
}
printNum.plus5(5)
## [1] 10
printNum.plus5(1000)
## [1] 1005
# example 2
plotWithAxisLabels <- function(x1, y1){
plot(x = x1, y = y1, xlab = "Here is x1", ylab = "Here is y1",
bty = "l", pch = 20)
}
plotWithAxisLabels(x1 = flr$distance, y1 = flr$slope)
# example 3 with sapply
# fake data
df1 <- data.frame(person = c("bob", "sue", "jack", "ann"), number1 = c(4,5,6,1), number2 = c(2,4,6,6))
df1
## person number1 number2
## 1 bob 4 2
## 2 sue 5 4
## 3 jack 6 6
## 4 ann 1 6
# want to convert to long format
longerizer1 <- function(name){
rep(name, times = df1[df1$person == name, "number1"])
}
longerizer1("bob")
## [1] "bob" "bob" "bob" "bob"
df1
## person number1 number2
## 1 bob 4 2
## 2 sue 5 4
## 3 jack 6 6
## 4 ann 1 6
longerizer1("sue")
## [1] "sue" "sue" "sue" "sue" "sue"
longerizer1(c("bob", "sue"))
## [1] "bob" "bob" "bob" "bob" "sue" "sue" "sue" "sue" "sue"
Exercises(3):
pFunct <- function(colNm){
if (is.numeric(flr[, colNm])) plot(y = flr$int, x = flr[, colNm], col = "red")
else if (is.factor(flr[, colNm])) plot(y = flr$int, x = flr[, colNm], col = "blue")
}
par(mfrow = c(1,2))
pFunct(colNm = "trt")
pFunct(colNm = "temp")
# pFunct(colNm = "rrkkr") causes an error
# here's a way to not have the eror crash your script
tryCatch(pFunct(bob), error = function(e) NA)
## [1] NA
Extra Practice:
lob <- read.table("datasets/loblolly.txt", sep = " ", header = TRUE)
head(lob)
## height age Seed
## 1 4.51 3 301
## 2 10.89 5 301
## 3 28.72 10 301
## 4 41.74 15 301
## 5 52.70 20 301
## 6 60.92 25 301
ggplot(lob, aes(x = age, y = height)) +
geom_point()
lob$age <- as.factor(lob$age)
lob$age <- as.factor(lob$age)
ggplot(lob, aes(x = age, y = height)) +
geom_boxplot()
seed <- as.factor(lob$Seed)
ggplot(lob, aes(x = age, y = height)) +
geom_boxplot() +
geom_point(aes(color = seed))
ggplot(lob, aes(x = age, y = height)) +
geom_boxplot() +
geom_point(aes(color = seed)) +
theme_classic()